import os
import re

# new_annos holds the raw annotation lines read from barbara.list;
# cleaned_new_annos collects the lines that survive the filtering pass below.
new_annos = []
cleaned_new_annos = []

# Read the whole annotation list up front. Opening the file directly (instead
# of checking os.path.exists first) avoids a race between the check and the
# open, and lets other I/O errors surface with their real cause.
try:
    with open("./barbara.list", 'r', encoding='utf-8') as f:
        long_character_anno = f.readlines()
except FileNotFoundError:
    print('barbara.list cannot be found, please confirm that the path is correct')
    # exit() is a helper injected by the `site` module for interactive use and
    # would report success (status 0); signal the failure with a non-zero code.
    raise SystemExit(1)
else:
    new_annos += long_character_anno

# Matches any ASCII letter. Compiled once, outside the loop, because the
# pattern never changes between lines.
my_re = re.compile(r'[A-Za-z]', re.S)

# Keep only well-formed "path|name|lang|text" rows whose text is long enough
# and contains no ASCII letters.
for line in new_annos:
    fields = line.split("|")
    if len(fields) != 4:
        # A blank or malformed row (wrong number of '|' separators) would raise
        # ValueError on unpacking and abort the whole run; report and skip it.
        print(f'Skip malformed line : {line!r}')
        continue
    path, name, lang, text = fields

    # Guarantee every kept row ends with a newline so rows stay on separate
    # lines when written out.
    if not text.endswith("\n"):
        text += "\n"

    # NOTE: the length threshold counts the trailing newline as one character.
    if len(text) < 5:
        print(f'skip too short wav : {text}')
        continue

    if my_re.search(text):
        print(f'Skip non-kanji text : {text}')
        continue

    cleaned_new_annos.append(path + "|" + name + "|" + lang + "|" + text)

# Write the surviving annotation rows to the training file-list directory.
# Each row already ends with a newline, so the rows are written back to back.
filelist_dir = "/mnt/workspace/Bert-VITS2/Data/train/filelists"
filelist_name = "clean_barbara.list"
save_path = os.path.join(filelist_dir, filelist_name)

with open(save_path, 'w', encoding='utf-8') as f:
    f.writelines(cleaned_new_annos)

print('完成数据集清晰! 标注文件（clean_barbara.list）已存放在', save_path)
# Delete the long_character_anno.txt file and then the barbara.list file,
# printing for each one whether it was removed or was already absent.
for stale_file, removed_msg, missing_msg in (
    (
        "./long_character_anno.txt",
        "long_character_anno.txt已成功删除",
        "long_character_anno.txt不存在",
    ),
    (
        "./barbara.list",
        "barbara.list已成功删除",
        "barbara.list不存在",
    ),
):
    if not os.path.exists(stale_file):
        print(missing_msg)
        continue
    os.remove(stale_file)
    print(removed_msg)